package com.zyx.crawlerdemo.webcollector.rediff;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.util.logging.Level;
import java.util.logging.Logger;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.rocks.BreadthCrawler;


/**
 * @author Yaxi.Zhang
 * @since 2021/8/12 13:25
 * desc: 爬取rediff工具类
 */
public class RediffNewsCrawler extends BreadthCrawler {

	private static final Logger LOGGER = Logger.getLogger(RediffNewsCrawler.class.getName());

	// Accumulates every crawled record for the lifetime of the crawl.
	// StringBuffer (not StringBuilder): visit() is invoked concurrently
	// because main() configures setThreads(5), so appends must be atomic.
	private final StringBuffer output = new StringBuffer();
	// Output file path and character encoding. Instance fields on purpose:
	// the original static fields were overwritten by each new instance.
	private final String fileName;
	private final String code;

	/**
	 * Creates a crawler for rediff.com news pages.
	 *
	 * @param crawlPath directory where WebCollector keeps its crawl state
	 * @param autoParse whether links are auto-extracted using the regex rules
	 * @param filename  path of the text file the scraped articles are written to
	 * @param webCode   character encoding for the output file, e.g. "utf-8"
	 */
	public RediffNewsCrawler(String crawlPath, boolean autoParse, String filename, String webCode) {
		super(crawlPath, autoParse);
		// Seed URLs the crawl starts from.
		this.addSeed("https://www.rediff.com");
		this.addSeed("https://www.rediff.com/business");

		/*
			URL filter rules:
				accept URLs prefixed with https://www.rediff.com/ and suffixed with .htm
				the leading "-" marks a WebCollector NEGATIVE rule: exclude media/static
				resources ending in (jpg|png|gif|css|js|mid|mp4|wav|avi|mov|mpeg|ram|m4v|pdf)
		 */
		this.addRegex("^(https://www.rediff.com/).*(\\.htm)$");
		this.addRegex("-.*\\.(jpg|png|gif|css|js|mid|mp4|wav|avi|mov|mpeg|ram|m4v|pdf)$");

		// Output file configuration: file name and encoding.
		this.fileName = filename;
		this.code = webCode;
	}

	/**
	 * Called by the framework for every fetched page; extracts the article
	 * title and body, appends them to the in-memory buffer, then rewrites
	 * the full snapshot to the output file.
	 *
	 * @param page the fetched page (HTML already parsed by jsoup)
	 * @param next collector for follow-up URLs (unused; autoParse handles links)
	 */
	@Override
	public void visit(Page page, CrawlDatums next) {
		String url = page.url();
		// Seed/listing URLs do not match the article pattern and are skipped.
		if (page.matchUrl("^(https://www.rediff.com"
				+ "/).*(\\.htm)$")){
			// Extract title and body text via jsoup CSS selectors.
			String title = page.select("#leftcontainer > h1").text();
			String content = page.select("#arti_content_n").text();
			output.append("URL:\t").append(url).append("\n")
					.append("title:\t").append(title)
					.append("\ncontent:\t").append(content).append("\n\n");
		}
		try {
			// Synchronized so concurrent visit() calls (setThreads(5)) cannot
			// interleave writes of the snapshot file.
			synchronized (output) {
				writeFile(fileName, output.toString(), code);
			}
		} catch (IOException e) {
			// Preserve the cause instead of printStackTrace(); the crawl
			// continues — losing one snapshot write is recoverable.
			LOGGER.log(Level.SEVERE, "Failed to write crawl output to " + fileName, e);
		}
	}

	/**
	 * Writes {@code content} to {@code file}, replacing any previous content.
	 * 
	 * @param file	output file path
	 * @param content text to write
	 * @param code file encoding, e.g. "utf-8"
	 * @throws IOException if the file cannot be created or written
	 */
	public static void writeFile(String file, String content, String code) 
			throws IOException {
		File result = new File(file);
		// try-with-resources closes both streams even when write() throws
		// (the original leaked the streams on any write failure).
		try (OutputStream out = new FileOutputStream(result, false);
				BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(out, code))) {
			bw.write(content);
		}
	}

	public static void main(String[] args) throws Exception {
		RediffNewsCrawler crawler = new RediffNewsCrawler("rediffNewsCrawler", 
				true,"data/rediffNews.txt","utf-8");
		// Number of worker threads.
		crawler.setThreads(5);
		// Maximum number of pages collected per depth level.
		crawler.getConf().setTopN(300);
		// Start crawling with the given maximum depth.
		crawler.start(3);
	}
}